```r
# Load the required packages (if not already loaded earlier)
pacman::p_load(tidyverse, tidytext, magrittr, SnowballC, parallel, knitr)

df <- read.csv("data/eventbride_18_04_14.csv", comment.char = "#",
               stringsAsFactors = FALSE)

# News outlets to keep
keeps <- c("faz.net", "focus.de", "handelsblatt.com", "n-tv.de",
           "spiegel.de", "stern.de", "sueddeutsche.de",
           "tagesschau.de", "welt.de", "zeit.de")

df <- df %>%
  mutate(text = body,
         # Extract the site from the source field
         site = str_extract(source, "(?<='uri': ')[A-Za-z][^']*"),
         date = as.Date(date)) %>%
  select(date, title, text, site, url, isDuplicate) %>%
  filter(site %in% keeps) %>%
  mutate(title_text = paste(title, text, sep = " "))
```
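The `source` column evidently holds a Python-dict-style string with a `'uri'` key; the lookbehind pattern above pulls out whatever follows `'uri': '` up to the closing quote. A quick illustrative check (the example string is made up to match that format):

```r
# Illustrative only: a fabricated source string in the expected format
str_extract("{'uri': 'spiegel.de', 'dataType': 'news'}",
            "(?<='uri': ')[A-Za-z][^']*")
#> [1] "spiegel.de"
```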
```r
# Keep only articles that mention Facebook in the title
df_facebook <- df %>%
  filter(grepl("facebook", title, ignore.case = TRUE))

# Calculate text length (number of words)
df_facebook$text_length <- sapply(gregexpr("\\S+", df_facebook$text), length)

ggplot(df_facebook, aes(text_length, group = site, color = site)) +
  geom_density() +
  labs(x = "", title = "Word count", color = "")
```
```r
# Keep only substantive articles
df_facebook <- df_facebook %>%
  filter(text_length > 100) %>%
  # remove articles that contain daily overviews
  filter(!grepl("Nachrichten am Morgen", title)) %>%
  filter(!grepl("Der Morgen live", title)) %>%
  filter(!grepl("Die Lage am", title)) %>%
  filter(!startsWith(title, "News")) %>%
  # remove articles that only contain video
  filter(!grepl("Video einbetten Nutzungsbedingungen Embedding Tagesschau", title_text)) %>%
  filter(!grepl("</div>", title_text)) %>%
  # remove texts that mostly contain user comments
  filter(!startsWith(text, "1.")) %>%
  # remove articles behind a paywall
  filter(!grepl("SPIEGEL-Plus-Artikel", text))

ggplot(df_facebook, aes(site)) +
  geom_bar(fill = col[3], alpha = 0.7) +
  labs(x = "", title = "Number of Articles") +
  theme(axis.text.x = element_text(angle = 60, size = 10))

stem_text <- function(text, language = "porter", mc.cores = 1) {
  # stem each word in a block of text
  stem_string <- function(str, language) {
    str <- strsplit(x = str, split = "\\s")
    str <- SnowballC::wordStem(unlist(str), language = language)
    str <- paste(str, collapse = " ")
    return(str)
  }
  # stem each text block in turn (parallel::mclapply)
  x <- mclapply(X = text, FUN = stem_string, language = language,
                mc.cores = mc.cores)
  # return stemmed text blocks
  return(unlist(x))
}
```
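Note that `language` defaults to `"porter"`, i.e. the English Porter stemmer; SnowballC also ships a dedicated German stemmer that may fit this corpus better. An illustrative call (the sentence is made up):

```r
# Illustrative only: stem a short German sentence with the German stemmer
stem_text("Die Nutzer teilen ihre Daten mit Facebook", language = "german")
```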
```r
# Stem the (previously cleaned) article texts
df_facebook$text_cleaned <- stem_text(df_facebook$text_cleaned)

# Tokenize into words and compute tf-idf per site
token <- df_facebook %>%
  group_by(site) %>%
  unnest_tokens(word, text_cleaned) %>%
  dplyr::count(site, word, sort = TRUE) %>%
  bind_tf_idf(word, site, n) %>%
  dplyr::arrange(desc(tf_idf))

token %>%
  arrange(desc(tf)) %>%
  arrange(site) %>%
  top_n(5, tf_idf) %>%
  knitr::kable(align = "l")
```

| site | word | n | tf | idf | tf_idf |
|---|---|---|---|---|---|
| faz.net | amerikanischen | 43 | 0.0042194 | 0.3566749 | 0.0015050 |
| faz.net | palantir | 23 | 0.0022569 | 0.6931472 | 0.0015644 |
| faz.net | procter | 10 | 0.0009813 | 2.3025851 | 0.0022594 |
| faz.net | gambl | 6 | 0.0005888 | 2.3025851 | 0.0013557 |
| faz.net | hässig | 6 | 0.0005888 | 2.3025851 | 0.0013557 |
| focus.de | iri | 10 | 0.0014618 | 2.3025851 | 0.0033659 |
| focus.de | malter | 10 | 0.0014618 | 2.3025851 | 0.0033659 |
| focus.de | modamani | 6 | 0.0008771 | 2.3025851 | 0.0020195 |
| focus.de | darm | 5 | 0.0007309 | 2.3025851 | 0.0016829 |
| focus.de | gaffer | 5 | 0.0007309 | 2.3025851 | 0.0016829 |
| focus.de | jobcent | 5 | 0.0007309 | 2.3025851 | 0.0016829 |
| handelsblatt.com | ap | 28 | 0.0014968 | 2.3025851 | 0.0034466 |
| handelsblatt.com | rtr | 25 | 0.0013365 | 2.3025851 | 0.0030773 |
| handelsblatt.com | hutter | 20 | 0.0010692 | 1.6094379 | 0.0017208 |
| handelsblatt.com | snb | 16 | 0.0008553 | 2.3025851 | 0.0019695 |
| handelsblatt.com | ma | 14 | 0.0007484 | 2.3025851 | 0.0017233 |
| n-tv.de | skript | 8 | 0.0014278 | 1.6094379 | 0.0022980 |
| n-tv.de | io | 8 | 0.0014278 | 1.2039728 | 0.0017190 |
| n-tv.de | datenriesen | 4 | 0.0007139 | 2.3025851 | 0.0016438 |
| n-tv.de | drastischer | 4 | 0.0007139 | 2.3025851 | 0.0016438 |
| n-tv.de | motherboard | 4 | 0.0007139 | 2.3025851 | 0.0016438 |
| n-tv.de | nonsen | 4 | 0.0007139 | 2.3025851 | 0.0016438 |
| n-tv.de | smilei | 4 | 0.0007139 | 2.3025851 | 0.0016438 |
| n-tv.de | synchronisierung | 4 | 0.0007139 | 2.3025851 | 0.0016438 |
| spiegel.de | netzwelt | 19 | 0.0025631 | 2.3025851 | 0.0059017 |
| spiegel.de | sonnenfeld | 17 | 0.0022933 | 2.3025851 | 0.0052804 |
| spiegel.de | anja | 8 | 0.0010792 | 2.3025851 | 0.0024849 |
| spiegel.de | kregeloh | 6 | 0.0008094 | 2.3025851 | 0.0018637 |
| spiegel.de | regina | 6 | 0.0008094 | 2.3025851 | 0.0018637 |
| stern.de | polizei | 25 | 0.0021901 | 1.2039728 | 0.0026368 |
| stern.de | skripal | 23 | 0.0020149 | 1.6094379 | 0.0032428 |
| stern.de | museum | 16 | 0.0014017 | 2.3025851 | 0.0032275 |
| stern.de | malt | 13 | 0.0011389 | 2.3025851 | 0.0026223 |
| stern.de | mansholt | 13 | 0.0011389 | 2.3025851 | 0.0026223 |
| sueddeutsche.de | mackai | 8 | 0.0010275 | 2.3025851 | 0.0023659 |
| sueddeutsche.de | stiftung | 7 | 0.0008990 | 1.6094379 | 0.0014470 |
| sueddeutsche.de | lehman | 6 | 0.0007706 | 2.3025851 | 0.0017744 |
| sueddeutsche.de | live | 5 | 0.0006422 | 2.3025851 | 0.0014787 |
| sueddeutsche.de | matter | 5 | 0.0006422 | 2.3025851 | 0.0014787 |
| tagesschau.de | studio | 8 | 0.0031471 | 2.3025851 | 0.0072465 |
| tagesschau.de | gebhart | 6 | 0.0023603 | 2.3025851 | 0.0054349 |
| tagesschau.de | werbeindustri | 4 | 0.0015736 | 1.2039728 | 0.0018945 |
| tagesschau.de | marcu | 3 | 0.0011802 | 2.3025851 | 0.0027174 |
| tagesschau.de | schuler | 3 | 0.0011802 | 2.3025851 | 0.0027174 |
| welt.de | verimi | 21 | 0.0012861 | 2.3025851 | 0.0029613 |
| welt.de | nast | 13 | 0.0007961 | 2.3025851 | 0.0018332 |
| welt.de | wambach | 11 | 0.0006736 | 2.3025851 | 0.0015511 |
| welt.de | unheimlich | 10 | 0.0006124 | 2.3025851 | 0.0014101 |
| welt.de | eigentum | 8 | 0.0004899 | 2.3025851 | 0.0011281 |
| zeit.de | garcía | 9 | 0.0008724 | 2.3025851 | 0.0020088 |
| zeit.de | martínez | 9 | 0.0008724 | 2.3025851 | 0.0020088 |
| zeit.de | wenger | 8 | 0.0007755 | 2.3025851 | 0.0017856 |
| zeit.de | elbvertiefung | 7 | 0.0006786 | 2.3025851 | 0.0015624 |
| zeit.de | picabia | 6 | 0.0005816 | 2.3025851 | 0.0013392 |
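A quick sanity check on the `idf` column: `bind_tf_idf` computes `idf(t) = ln(N / n_t)`, with `N` the number of documents (here the 10 sites) and `n_t` the number of sites whose articles contain the term. Terms unique to a single site therefore get `ln(10/1) ≈ 2.3025851`, the value dominating the table, while e.g. `1.6094379 = ln(10/2)` and `1.2039728 = ln(10/3)` mark terms shared by two and three sites.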
```r
# Tokenize into bigrams and list the most frequent ones per site
bigrams <- df_facebook %>%
  unnest_tokens(bigram, text_cleaned, token = "ngrams", n = 2)

bigrams %>%
  group_by(site) %>%
  count(bigram) %>%
  arrange(desc(n)) %>%
  top_n(5, n) %>%
  knitr::kable(align = "l")
```
| site | bigram | n |
|---|---|---|
| handelsblatt.com | cambridg analytica | 173 |
| welt.de | cambridg analytica | 132 |
| zeit.de | cambridg analytica | 104 |
| handelsblatt.com | mark zuckerberg | 97 |
| faz.net | cambridg analytica | 88 |
| sueddeutsche.de | cambridg analytica | 70 |
| spiegel.de | cambridg analytica | 68 |
| welt.de | mark zuckerberg | 68 |
| handelsblatt.com | facebook chef | 59 |
| handelsblatt.com | millionen facebook | 56 |
| focus.de | cambridg analytica | 55 |
| stern.de | cambridg analytica | 55 |
| handelsblatt.com | daten millionen | 54 |
| n-tv.de | cambridg analytica | 47 |
| zeit.de | mark zuckerberg | 47 |
| welt.de | facebook chef | 42 |
| faz.net | mark zuckerberg | 39 |
| welt.de | donald trump | 39 |
| stern.de | mark zuckerberg | 35 |
| sueddeutsche.de | mark zuckerberg | 33 |
| zeit.de | millionen facebook | 33 |
| n-tv.de | mark zuckerberg | 32 |
| welt.de | daten millionen | 32 |
| zeit.de | daten millionen | 32 |
| faz.net | sozial netzwerk | 30 |
| zeit.de | facebook chef | 30 |
| sueddeutsche.de | facebook chef | 28 |
| faz.net | facebook nutzern | 26 |
| faz.net | millionen facebook | 26 |
| spiegel.de | mark zuckerberg | 26 |
| sueddeutsche.de | daten millionen | 26 |
| tagesschau.de | cambridg analytica | 26 |
| focus.de | mark zuckerberg | 25 |
| n-tv.de | facebook chef | 24 |
| focus.de | donald trump | 23 |
| n-tv.de | daten millionen | 23 |
| stern.de | facebook chef | 23 |
| sueddeutsche.de | u kongress | 22 |
| stern.de | donald trump | 21 |
| stern.de | u kongress | 21 |
| focus.de | millionen facebook | 20 |
| spiegel.de | facebook chef | 19 |
| focus.de | daten millionen | 18 |
| focus.de | facebook chef | 18 |
| focus.de | facebook nutzern | 18 |
| n-tv.de | donald trump | 18 |
| spiegel.de | donald trump | 18 |
| tagesschau.de | mark zuckerberg | 15 |
| tagesschau.de | facebook chef | 12 |
| spiegel.de | datenskand facebook | 10 |
| spiegel.de | millionen facebook | 10 |
| tagesschau.de | daten millionen | 10 |
| tagesschau.de | donald trump | 9 |
| tagesschau.de | facebook nutzern | 9 |
| tagesschau.de | millionen facebook | 9 |
```r
library(quanteda)

# Build a corpus and document-feature matrix, then draw a word cloud
all.corpus <- corpus(df_facebook$text_cleaned)
df.corpus <- dfm(all.corpus)
textplot_wordcloud(df.corpus, max.words = 200, colors = col)
```

```r
pacman::p_load(sentimentr)

# Load the SentiWS dictionaries (from: http://wortschatz.uni-leipzig.de/de/download)
neg_df <- read_tsv("dict/SentiWS_v1.8c_Negative.txt", col_names = FALSE)
pos_df <- read_tsv("dict/SentiWS_v1.8c_Positive.txt", col_names = FALSE)

sentiment_df <- bind_rows(neg_df, pos_df)
names(sentiment_df) <- c("Wort_POS", "polarity", "Inflektionen")
```
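Each SentiWS entry couples the word with its part-of-speech tag, e.g. `Abbau|NN` (the entry here is illustrative), so the word has to be cut out before the `|`. The extraction logic used below, checked on one such string:

```r
# Illustrative only: take everything before the '|' of a SentiWS-style entry
x <- "Abbau|NN"
str_sub(x, 1, regexpr("\\|", x) - 1)
#> [1] "Abbau"
```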
```r
# Keep only the (lower-cased) word and its polarity
sentiment_df %>%
  mutate(words = str_sub(Wort_POS, 1, regexpr("\\|", .$Wort_POS) - 1),
         words = tolower(words)
         #POS = str_sub(Wort_POS, start = regexpr("\\|", .$Wort_POS) + 1)
         ) %>%
  select(words, polarity) -> sentiment_df

# Add the negation "nicht" and convert the table into a sentimentr key
sentiment_df <- rbind(sentiment_df, c("nicht", -0.8))
sentiment_df %>%
  mutate(polarity = as.numeric(polarity)) %>%
  as_key() -> sentiment_df
```

We may wish to see the output from `sentiment_by` line by line, with positive and negative sentences highlighted. The `highlight` function wraps a `sentiment_by` output and produces a highlighted HTML file (positive = green; negative = pink). Let's have a look at a random article here.
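A minimal sketch of that step (the sampling is my own illustration; `highlight` writes and opens `polarity.html` by default):

```r
# Illustrative only: score one randomly drawn article sentence by sentence
# and open the highlighted HTML rendering
set.seed(1)
df_facebook %>%
  sample_n(1) %>%
  mutate(split = get_sentences(title_text)) %$%
  sentiment_by(split, polarity_dt = sentiment_df) %>%
  highlight()
```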
Let's apply this to the whole corpus.
```r
sent_df <- df_facebook %>%
  mutate(split = get_sentences(title_text)) %$%
  sentiment(split, polarity_dt = sentiment_df)

# sentiment() scores each sentence, so the join yields one row per sentence
df_facebook$element_id <- as.numeric(rownames(df_facebook))
df_facebook <- left_join(df_facebook,
                         sent_df %>% select(element_id, sentiment),
                         by = "element_id")
```

```r
# Sentence-level scores per site, with the per-site mean on top
df_facebook %>%
  group_by(site) %>%
  mutate(ave_sentiment = mean(sentiment)) -> plot_df

# the "text" aesthetic is ignored by ggplot2 (its warning is harmless)
# but picked up by plotly for the hover tooltips
p1 <- plot_df %>%
  ggplot(aes(sentiment, site, text = title)) +
  geom_point(color = "blue", alpha = .5, shape = 1) +
  geom_point(aes(ave_sentiment, site), color = "red", size = 2) +
  xlim(c(-0.3, 0.3)) +
  labs(y = "")

plotly::ggplotly(p1)
```